Streaming Data Management and Time Series Analysis Final Project

Modelli Arima

# Load libraries: KFAS (state-space / UCM models), xts (time-series objects),
# forecast (ARIMA, Box-Cox, Fourier regressors), Metrics (error metrics),
# reticulate (Python interop for the GRU model).
library(KFAS)
library(xts)
library(forecast)
library(Metrics)
library(reticulate)
# NOTE(review): hard-coded interpreter path — portability issue; confirm this
# path exists on the target machine or switch to a virtualenv/condaenv.
use_python("/usr/local/python")

Caricamento della serie storica e conversione dei dati in oggetti xts

# Read the raw series (semicolon-separated CSV) and build an xts object
# indexed by the parsed "Data" column.
dt <- read.csv("csv_weights/time_series_dataset.csv", sep = ";")
obs_dates <- as.Date(as.character(dt$Data), format = "%Y-%m-%d")
y <- xts(dt$value, obs_dates)
plot(y)

Confronto tra media e deviazione standard mensili per valutare una trasformazione (logaritmica o Box-Cox) che risolva la non stazionarietà in varianza.

# Monthly mean vs. monthly standard deviation: an upward-sloping relation
# indicates the variance grows with the level, motivating a variance-
# stabilising (log / Box-Cox) transform.
monthly_stats <- cbind(
  mean = as.numeric(apply.monthly(y, mean)),
  sd   = as.numeric(apply.monthly(y, sd))
)
means <- monthly_stats[, "mean"]
stdev <- monthly_stats[, "sd"]
plot(means, stdev)
abline(lm(stdev ~ means), col = "red")

Trasformazione di BoxCox per risolvere la non stazionarietà in varianza

# Box-Cox transform to stabilise the variance.
# BUG FIX: the original used BoxCox(y, lambda = "auto") and then separately
# BoxCox.lambda(y). "auto" estimates lambda internally with a different lower
# bound (-0.9) than BoxCox.lambda's default (-1), so the lambda saved for the
# final inverse transform could differ from the one actually applied to the
# series. Estimate lambda once and use it for both.
lambda <- BoxCox.lambda(y)
y_box <- BoxCox(y, lambda = lambda)

# Train on 2010-2017, validate on 2018 (365 days).
y_train <- y_box["2010-01-01/2017-12-31"]
y_train_num <- as.numeric(y_train)
y_val <- y_box["2018-01-01/2018-12-31"]
y_val_num <- as.numeric(y_val)
#write.csv(y_box, "time_series_trasformed.csv")

# Two lag horizons: short-run (weekly) and long-run (annual) structure.
ggtsdisplay(y_train, lag.max = 100)

ggtsdisplay(y_train, lag.max = 1000)

I grafici vengono plottati con due diversi lag.max in modo da cogliere sia gli andamenti a breve termine che a lungo termine. Dall'acf si può notare una discesa lineare e la presenza di stagionalità settimanale e annua. Dal pacf si può notare una discesa geometrica ogni 7 giorni. Guardando i grafici si può cogliere un trend che rende la serie non stazionaria in media. Il trend sembra essere inizialmente decrescente e poi crescente. Eseguiamo le due integrazioni:

# Baseline: (0,1,0)(0,1,0)[7] — one regular and one weekly seasonal
# difference to remove trend and weekly seasonality, no ARMA terms yet.
mod1 <- Arima(y_train, order = c(0,1,0), seasonal = list(order = c(0,1,0), period= 7))
ggtsdisplay(mod1$residuals, lag.max = 50) 

# Long-horizon view of the same residuals (annual pattern still visible).
ggtsdisplay(mod1$residuals, lag.max = 1000)

Guardando la Pacf sembra esserci sia una componente AR(p)/MA(q) che una componente SMA(7)

# Add regular AR/MA terms and a seasonal MA term: ARIMA(1,1,1)(0,1,1)[7].
mod2 <- Arima(y_train, order = c(1,1,1), seasonal = list(order = c(0,1,1), period= 7))
ggtsdisplay(mod2$residuals, lag.max = 100)

#ggtsdisplay(mod2$residuals, lag.max = 1000)

I grafici suggeriscono l’inserimento di un SAR(7)

# Add a seasonal AR term as well: ARIMA(1,1,1)(1,1,1)[7].
mod3 <- Arima(y_train, order = c(1,1,1), seasonal = list(order = c(1,1,1), period= 7))
checkresiduals(mod3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(1,1,1)[7]
## Q* = 21.54, df = 6, p-value = 0.001467
## 
## Model df: 4.   Total lags used: 10
# Short- and long-lag residual diagnostics; the annual seasonality is still
# visible at long lags and is handled next with Fourier regressors.
ggtsdisplay(mod3$residuals, lag.max = 100) 

ggtsdisplay(mod3$residuals, lag.max = 1000)

I lag sembrano essere sotto le bande al 95%. Rimane da gestire la stagionalità annua e si può effettuare una sorta di grid search, che tenga in considerazione il numero di termini di Fourier per cogliere l'informazione intra-annua e l'ordine di AR, per aumentare la bontà del modello. La scelta del modello migliore viene fatta guardando l'AIC.

# Grid search over the AR order p (1..6) and the number of annual Fourier
# harmonics k; the winner is picked later by AIC.
i <- 1
harm <- c(1, 5, 10)
best_arima <- data.frame(p = rep(NA, 18), k = rep(NA, 18), AIC = rep(NA, 18))

# FIX: the Fourier regressors depend only on k, so build them once per
# harmonic count instead of on every (p, k) iteration. The original also
# built an unused 365-step-ahead test matrix inside the loop.
fs_by_k <- lapply(harm, function(k) fourier(ts(y_train, frequency = 365), K = k))
names(fs_by_k) <- as.character(harm)

for (p in 1:6) {

  for (k in harm) {
    # FIX: reset `model` so a failed fit cannot silently reuse the model
    # from a previous iteration.
    model <- NULL
    an.error.occured <- FALSE
    tryCatch(
      {
        model <- Arima(y_train, c(p, 1, 1), list(order = c(1, 1, 1), period = 7),
                       xreg = fs_by_k[[as.character(k)]], include.constant = TRUE)
        print(model$aic)
      },
      error = function(e) { an.error.occured <<- TRUE }
    )

    if (!an.error.occured && !is.null(model)) {
      best_arima[i, ] <- c(p, k, model$aic)
    }
    i <- i + 1
  }

}
best_arima
write.csv(best_arima,"csv_weights/best_arima.csv", row.names = FALSE)
best_arima <- read.csv("csv_weights/best_arima.csv")
# Pick the (p, k) combination with the lowest AIC.
best_arima_config <- best_arima[which.min(best_arima$AIC),]
best_arima_config
##    p  k      AIC
## 18 6 10 22012.17
# Rebuild the Fourier regressors for the winning k: in-sample matrix for
# fitting, 365-step-ahead matrix for the validation forecast.
fs_train <- fourier(ts(y_train, frequency = 365), K = best_arima_config$k)
fs_test <- fourier(ts(y_train, frequency = 365), K = best_arima_config$k, h = 365)

# Refit the selected ARIMA with Fourier regressors and forecast the 2018
# validation year.
mod4 <- Arima(y_train, c(best_arima_config$p,1,1), list(order=c(1,1,1), period=7), 
                  xreg=fs_train, include.constant = TRUE)

pred4<- forecast(mod4, h=365, xreg=fs_test)
autoplot(pred4)

# MAPE is computed on the Box-Cox transformed scale.
print(paste0("MAPE on training set ", mean(abs(mod4$fitted - y_train_num)/y_train_num) * 100))
## [1] "MAPE on training set 8.99792859533606"
print(paste0("MAPE on validation set ", mean(abs(pred4$mean - y_val_num)/y_val_num) * 100)) 
## [1] "MAPE on validation set 10.9370875875956"
# Validation year: observed (black) vs forecast (blue).
# NOTE(review): stray trailing comma in the plot() call below (harmless but
# should be removed).
plot(y_val_num, type = "l",)
lines(as.numeric(pred4$mean), col = "blue")

Inserimento delle festività tramite dummy:

library(dsa)

# Calendar frame covering training, validation and the 2019 test horizon.
dm <- data.frame(Date = seq(from = as.Date("2010-01-01"), to = as.Date("2019-12-31"), by = 1))

holidays <- c("EasterMon","EasterTue","EasterSat","Nov1","Dec24","Dec25","Dec26","Dec31","Jan1","Jan6","Dec8","Aug15","Apr25","Mag1","Giu2")
#timeDate::listHolidays()

# Moving holidays come from timeDate; fixed-date holidays are built directly.
EasterMon <- as.Date(c(timeDate::EasterMonday(2010:2019)))
EasterTue <- EasterMon + 1
EasterSat <- EasterMon - 2
Nov1 <-  as.Date(c(timeDate::AllSaints(2010:2019)))
Dec24 <- as.Date(c(timeDate::ChristmasEve(2010:2019)))
Dec25 <- Dec24 + 1
Dec26 <- Dec25 + 1
Dec31 <- Dec26 + 5
# BUG FIX: the original derived Jan 1 as `Dec26 + 6 - 365`, which lands on
# Jan 2 in leap years (2012, 2016) — and Jan 6 inherited the same shift.
# Fixed-date holidays are constructed explicitly instead of by offsetting.
Jan1 <- as.Date(paste0(2010:2019, "-01-01"))
Jan6 <- as.Date(paste0(2010:2019, "-01-06"))
Dec8 <- as.Date(c(timeDate::ITImmaculateConception(2010:2019)))
Aug15 <- as.Date(c(timeDate::ITAssumptionOfVirginMary(2010:2019)))
Apr25 <- as.Date(c(timeDate::ITLiberationDay(2010:2019)))
Mag1 <- as.Date(paste0(2010:2019, "-05-01"))
Giu2 <- as.Date(paste0(2010:2019, "-06-02"))

# One 0/1 dummy column per holiday, initialised to 0 (the original grew the
# frame one cbind at a time inside a loop).
dm[holidays] <- 0

# Mark each holiday occurrence (one per year, 2010-2019) in its dummy column.
for( j in seq(10)) {
  dm$EasterTue[which(dm$Date == EasterTue[j])] <- 1
  dm$EasterMon[which(dm$Date == EasterMon[j])] <- 1
  dm$EasterSat[which(dm$Date == EasterSat[j])] <- 1
  dm$Nov1[which(dm$Date == Nov1[j])] <- 1
  dm$Dec24[which(dm$Date == Dec24[j])] <- 1
  dm$Dec25[which(dm$Date == Dec25[j])] <- 1
  dm$Dec26[which(dm$Date == Dec26[j])] <- 1
  dm$Dec31[which(dm$Date == Dec31[j])] <- 1
  dm$Jan1[which(dm$Date == Jan1[j])] <- 1
  dm$Jan6[which(dm$Date == Jan6[j])] <- 1
  dm$Dec8[which(dm$Date == Dec8[j])] <- 1
  dm$Aug15[which(dm$Date == Aug15[j])] <- 1
  dm$Apr25[which(dm$Date == Apr25[j])] <- 1
  dm$Mag1[which(dm$Date == Mag1[j])] <- 1
  dm$Giu2[which(dm$Date == Giu2[j])] <- 1
}

# hd = number of holiday flags on each date (combined-holiday regressor).
hd <- rowSums(dm[holidays])
dm <- cbind(dm, hd)

# Zero all dummies falling on Sundays: the weekly seasonal component already
# models them.
# NOTE(review): weekdays() is locale-dependent — "Domenica" only matches in
# an Italian locale; confirm the runtime locale or use a locale-independent
# test (e.g. format(Date, "%u") == "7").
dm$weekday <- weekdays(as.Date(dm$Date))
dm[which(dm$weekday == "Domenica"),][holidays] <- c(rep(0,15))
# Persist the dummy calendar, then reload it (checkpoint so later runs can
# skip the construction step).
write.csv(dm, "csv_weights/dummy_project.csv", row.names = FALSE)
holidays <- c("EasterMon", "EasterTue", "EasterSat", "Nov1", "Dec24", "Dec25",
              "Dec26", "Dec31", "Jan1", "Jan6", "Dec8", "Aug15", "Apr25",
              "Mag1", "Giu2")
dm <- read.csv("csv_weights/dummy_project.csv")

# Annual Fourier regressors for the selected k, plus the holiday dummies,
# split into the training window and the 365-day validation window.
n_train <- length(y_train)
fs_train <- fourier(ts(y_train, frequency = 365), K = best_arima_config$k)
fs_test <- fourier(ts(y_train, frequency = 365), K = best_arima_config$k, h = 365)
dummy_holidays_train <- dm[holidays][seq_len(n_train), ]
dummy_holidays_val <- dm[holidays][(n_train + 1):(n_train + 365), ]

# ARIMA with Fourier terms plus holiday-dummy regressors.
mod4_reg <- Arima(y_train, c(best_arima_config$p,1,1), list(order=c(1,1,1), period=7), 
                  xreg=as.matrix(cbind(fs_train,dummy_holidays_train)), include.constant = TRUE)

#checkresiduals(mod4_reg)
pred4_reg <- forecast(mod4_reg, h=365, xreg=as.matrix(cbind(fs_test,dummy_holidays_val)))
#autoplot(pred4_reg)

# MAPE on the transformed scale, train vs validation.
print(paste0("MAPE on training set ", mean(abs(mod4_reg$fitted - y_train_num)/y_train_num) * 100))
## [1] "MAPE on training set 8.45300064992593"
print(paste0("MAPE on validation set ", mean(abs(pred4_reg$mean - y_val_num)/y_val_num) * 100)) 
## [1] "MAPE on validation set 10.918625828688"
# plot validation: observed (black) vs forecast (blue)
plot(y_val_num, type = "l")
lines(as.numeric(pred4_reg$mean), col = "blue")

Proviamo a far variare k tenendo lo stesso modello e le dummy delle festività

# Grid search over k (annual Fourier harmonics) keeping the ARIMA orders and
# the holiday dummies fixed; k = 0 means "dummies only, no Fourier terms".
# Selection is by validation MAPE (recorded alongside AIC).
i <- 1
harm <- c(0, 1, 2, 4, 6, 10)
best_arima_reg <- data.frame(k = rep(NA, 6), AIC = rep(NA, 6), Mape_val = rep(NA, 6))

for (k in harm) {
  an.error.occured <- FALSE
  model <- NULL  # FIX: avoid silently reusing the previous iteration's fit
  if (k != 0) {
    fs_train <- fourier(ts(y_train, frequency = 365), K = k)
    fs_test <- fourier(ts(y_train, frequency = 365), K = k, h = 365)
    tryCatch(
      {
        model <- Arima(y_train, c(best_arima_config$p, 1, 1),
                       list(order = c(1, 1, 1), period = 7),
                       xreg = as.matrix(cbind(fs_train, dummy_holidays_train)),
                       include.constant = TRUE)
        print(model$aic)
      },
      error = function(e) { an.error.occured <<- TRUE }
    )
    # FIX: the original called forecast() unconditionally, so a failed fit
    # would either crash here or silently forecast with a stale model.
    if (!an.error.occured && !is.null(model)) {
      prediction <- forecast(model, h = 365,
                             xreg = as.matrix(cbind(fs_test, dummy_holidays_val)))
    }
  } else {

    model <- Arima(y_train, c(best_arima_config$p, 1, 1),
                   list(order = c(1, 1, 1), period = 7),
                   xreg = as.matrix(dummy_holidays_train), include.constant = TRUE)
    prediction <- forecast(model, h = 365, xreg = as.matrix(dummy_holidays_val))
  }

  if (!an.error.occured && !is.null(model)) {

    mape_val <- mean(abs(prediction$mean - y_val_num) / y_val_num) * 100
    best_arima_reg[i, ] <- c(k, model$aic, mape_val)
  }
  i <- i + 1
}

best_arima_reg
write.csv(best_arima_reg,"csv_weights/best_arima_reg.csv", row.names = FALSE)
best_arima_reg <- read.csv("csv_weights/best_arima_reg.csv")
# Pick the configuration with the lowest validation MAPE (not AIC this time).
best_arima_reg_config <- best_arima_reg[which.min(best_arima_reg$Mape_val),]
best_arima_reg_config
##   k      AIC Mape_val
## 2 1 21613.34 10.08227

Miglior modello arima con regressori dummy per la festività e K = 1 termini di Fourier

# Refit the winning configuration: holiday dummies + K = 1 Fourier pair.
fs_train <- fourier(ts(y_train, frequency = 365), K = best_arima_reg_config$k)
fs_test <- fourier(ts(y_train, frequency = 365), K = best_arima_reg_config$k, h = 365)
dummy_holidays_train <- dm[holidays][1:length(y_train),]
dummy_holidays_val <- dm[holidays][(length(y_train)+1):(length(y_train)+365),]

mod5_reg <- Arima(y_train, c(best_arima_config$p,1,1), list(order=c(1,1,1), period=7), 
                  xreg=as.matrix(cbind(fs_train,dummy_holidays_train)), include.constant = TRUE)
#mod5_reg
#checkresiduals(mod5_reg)
pred5_reg <- forecast(mod5_reg, h=365, xreg=as.matrix(cbind(fs_test,dummy_holidays_val)))
autoplot(pred5_reg)

# MAPE on the transformed scale, train vs validation.
print(paste0("MAPE on training set ", mean(abs(mod5_reg$fitted - y_train_num)/y_train_num) * 100))
## [1] "MAPE on training set 8.53247472403642"
print(paste0("MAPE on validation set ", mean(abs(pred5_reg$mean - y_val_num)/y_val_num) * 100)) 
## [1] "MAPE on validation set 10.0822667905118"
# Full-sample plot on the transformed scale: fitted (red), forecast (blue).
plot(as.numeric(y_box), type ="l")
lines(pred5_reg$fitted, col = "red")
lines(pred5_reg$mean, col = "blue")

# Validation plot comparing forecasts with and without holiday dummies.
plot(y_val_num, type = "l")
lines(as.numeric(pred4$mean), col = "blue")
lines(as.numeric(pred5_reg$mean), col = "red")
# FIX: corrected the legend typo "whitout" -> "without".
legend(280, 160, legend=c("without dummy", "with dummy"), col=c("blue", "red"), lty=1:1, cex=0.8)

# Ljung-Box diagnostics on the two regression-ARIMA models: both reject
# residual whiteness, so some autocorrelation structure remains, but the
# dummy model (mod5_reg) has the smaller test statistic.
checkresiduals(mod4_reg)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(6,1,1)(1,1,1)[7] errors
## Q* = 78.634, df = 3, p-value < 2.2e-16
## 
## Model df: 44.   Total lags used: 47
checkresiduals(mod5_reg)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(6,1,1)(1,1,1)[7] errors
## Q* = 54.442, df = 3, p-value = 9.033e-12
## 
## Model df: 26.   Total lags used: 29

Modelli UCM

Date le caratteristiche della serie storica sono stati provati due modelli UCM: - RW con stagionalità settimanale (dummy) e annuale (trigonometrica) - IRW con stagionalità settimanale (dummy) e annuale (trigonometrica)

Su entrambi i modelli è stato effettuato un grid search per scegliere il numero di armoniche migliore

# Validation-as-missing setup: extend the training series with NAs over the
# 2018 horizon so the Kalman smoother produces out-of-sample predictions.
y_ucm <- y_train_num
y_ucm[(length(y_train)+1):(nrow(dt))] <- NA 
#plot(y_ucm, type ="l")

# FIX: the grid below runs 2 trend types x 5 harmonic counts = 10 fits; the
# results frame was preallocated with only 8 rows and silently grew on
# assignment. Size it correctly.
best_ucm <- data.frame(Name = rep(NA, 10), Mape_train = rep(NA, 10), Mape_val = rep(NA, 10))
harm <- c(2,6,10,12,14)
# Series variance: used as initial-state prior variance and to scale the
# optimiser's initial log-variance guesses.
vary <- var(y_ucm, na.rm = TRUE)

# fitSSM update function: map the unconstrained parameter vector `pars`
# (log-variances) into the model's covariance matrices.
#   pars[1]       -> level disturbance variance
#   pars[2]       -> weekly (dummy) seasonal disturbance variance
#   pars[3]       -> slope disturbance variance, only when m == 1 (IRW)
#   pars[3 + m]   -> common variance of the 2k trigonometric seasonal states
#   pars[4 + m]   -> observation noise variance
# k = number of annual harmonics; m = 1 for the IRW trend, 0 for RW.
updt_for <- function(pars, model, k, m) {
  variances <- exp(pars)
  model$Q[1, 1, 1] <- variances[1]
  model$Q[2, 2, 1] <- variances[2]
  if (m == 1) {
    model$Q[3, 3, 1] <- variances[3]
  }
  # The trigonometric seasonal block occupies states (3 + m) .. (2 + 2k + m).
  trig_idx <- (3 + m):(2 + 2 * k + m)
  diag(model$Q[trig_idx, trig_idx, 1]) <- variances[3 + m]
  model$H[1, 1, 1] <- variances[4 + m]
  model
}
i <- 1
# Grid over trend type (m = 0: random walk, m = 1: integrated random walk)
# and number of annual trigonometric harmonics k; each fit is scored by
# train/validation MAPE on the smoothed mean.
for (m in 0:1) {

  for(k in harm){

    if (m == 0) {
      model <- SSModel(y_ucm ~ SSMtrend(1, NA) + SSMseasonal(7, NA, "dummy") + SSMseasonal(365, NA, "trig", harmonics = 1:k), H = NA)
      name <- paste0("RW + seas_7  + seas_365 - k = ",k)
      init <- numeric(4)
    }
    else {
      model <- SSModel(y_ucm ~ SSMtrend(2, list(0,NA)) + SSMseasonal(7, NA, "dummy") + SSMseasonal(365, NA, "trig", harmonics = 1:k), H = NA)
      init <- numeric(5)
      init[1] <- 0
      name <- paste0("IRW + seas_7  + seas_365 - k = ",k)
    }

    model$P1inf <- model$P1inf * 0 # disable exact diffuse initialisation (zero matrix)
    model$a1[1] <- mean(y_ucm, na.rm = TRUE) # start the level at the series mean
    diag(model$P1) <- vary # series variance on the whole diagonal (loose upper bound)

    init[(1+m)] <- log(vary) # log-var(level disturbance)
    init[(2+m)] <- log(vary/10) # log-var(weekly dummy seasonal)
    init[(3+m)] <- log(vary/10) # log-var(trig seasonal)
    init[(4+m)] <- log(vary) # log-var(observation noise)

    fit <- fitSSM(model, init, updt_for, update_args = c(k = k, m = m))
    smo <- KFS(fit$model, smoothing = c("state","mean"))

    # muhat over the NA stretch is the out-of-sample prediction.
    mape_train <- mean(abs(smo$muhat[1:(length(y_train))] - y_train_num)/y_train_num) * 100
    mape_val <- mean(abs(smo$muhat[(length(y_train)+1):(nrow(dt))] - y_val_num)/y_val_num) * 100

    best_ucm[i,] <- c(name, mape_train, mape_val)
    i <- i + 1
  }
}

write.csv(best_ucm,"csv_weights/best_ucm.csv", row.names = FALSE)
best_ucm <- read.csv("csv_weights/best_ucm.csv")
# Best configuration by validation MAPE: RW trend with k = 14 harmonics.
best_ucm[which.min(best_ucm$Mape_val),]
##                               Name Mape_train Mape_val
## 5 RW + seas_7  + seas_365 - k = 14   7.299592 15.18734
# Refit the winning UCM: RW level + weekly dummy seasonal + annual
# trigonometric seasonal with 14 harmonics.
mod2_ucm <- SSModel(y_ucm ~ SSMtrend(1, NA) + 
                      SSMseasonal(7, NA, "dummy") + 
                      SSMseasonal(365, NA, "trig", harmonics = 1:14), H = NA)
init <- numeric(4)
init[1] <- log(vary) # log-var(level disturbance)
init[2] <- log(vary/10) # log-var(weekly dummy seasonal)
init[3] <- log(vary/10) # log-var(trig seasonal)
init[4] <- log(vary)  # log-var(observation noise)

mod2_ucm$P1inf <- mod2_ucm$P1inf * 0 
mod2_ucm$a1[1] <- mean(y_ucm, na.rm = TRUE) 
diag(mod2_ucm$P1) <- vary 
fit2 <- fitSSM(mod2_ucm, init, updt_for, update_args = c(k = 14, m = 0))
#fit2$optim.out$convergence
smo2 <- KFS(fit2$model, smoothing = c("state","mean"))

print(paste0("MAPE on training set ", mean(abs(smo2$muhat[1:(length(y_train))] - y_train_num)/y_train_num) * 100))
## [1] "MAPE on training set 7.29959174329699"
print(paste0("MAPE on validation set ", mean(abs(smo2$muhat[(length(y_train)+1):(nrow(dt))] - y_val_num)/y_val_num) * 100)) 
## [1] "MAPE on validation set 15.1873440009321"
# Validation year: observed (black) vs smoothed mean (blue).
# NOTE(review): lines() draws the FULL muhat vector against a 365-point
# x-axis, so the in-sample part is plotted, not the validation stretch —
# probably intended: smo2$muhat[(length(y_train)+1):(nrow(dt))]; confirm.
plot(y_val_num, type = "l")
lines(smo2$muhat, col = "blue")

Proviamo ad inserire nel modello le dummy per le festività

# Add the 15 holiday dummies as fixed regressors to the best UCM.
dummy_holidays_ucm <- dm[holidays][1:length(y_ucm),]
mod3_ucm <- SSModel(y_ucm ~ dummy_holidays_ucm$EasterSat +
                      dummy_holidays_ucm$EasterMon +
                      dummy_holidays_ucm$EasterTue +
                      dummy_holidays_ucm$Dec24 +
                      dummy_holidays_ucm$Dec25 +
                      dummy_holidays_ucm$Dec26 +
                      dummy_holidays_ucm$Dec31 +
                      dummy_holidays_ucm$Dec8 +
                      dummy_holidays_ucm$Jan1 +
                      dummy_holidays_ucm$Nov1 +
                      dummy_holidays_ucm$Jan6 +
                      dummy_holidays_ucm$Aug15 +
                      dummy_holidays_ucm$Apr25 +
                      dummy_holidays_ucm$Mag1 +
                      dummy_holidays_ucm$Giu2 +
                      SSMtrend(1, NA) + SSMseasonal(7, NA, "dummy") + SSMseasonal(365, NA, "trig", harmonics = 1:14), H = NA)

# Same initial log-variances as before. Assumes the regression coefficients
# are time-invariant states with no disturbance, so updt_for's Q indexing is
# unchanged — TODO confirm against KFAS's state ordering for ~ x + SSM terms.
init <- numeric(4)
init[1] <- log(vary) # log-var(level disturbance)
init[2] <- log(vary/10) # log-var(weekly dummy seasonal)
init[3] <- log(vary/10) # log-var(trig seasonal)
init[4] <- log(vary)  # log-var(observation noise)

mod3_ucm$P1inf <- mod3_ucm$P1inf * 0 
mod3_ucm$a1[1] <- mean(y_ucm, na.rm = TRUE) 
diag(mod3_ucm$P1) <- vary 
fit3 <- fitSSM(mod3_ucm, init, updt_for, update_args = c(k = 14, m = 0))

smo3 <- KFS(fit3$model, smoothing = c("state","mean"))

print(paste0("MAPE on training set ", mean(abs(smo3$muhat[1:(length(y_train))] - y_train_num)/y_train_num) * 100))
## [1] "MAPE on training set 6.51449913711938"
print(paste0("MAPE on validation set ", mean(abs(smo3$muhat[(length(y_train)+1):(nrow(dt))] - y_val_num)/y_val_num) * 100))
## [1] "MAPE on validation set 20.9862678179542"
# Validation year: observed (black) vs smoothed mean (blue).
# NOTE(review): same full-muhat indexing issue as the mod2_ucm plot above.
plot(y_val_num, type = "l")
lines(smo3$muhat, col = "blue")

L’aggiunta dei regressori peggiora le performance del modello. Come si può vedere dal Mape del training set, il modello overfitta.

Modelli non lineari - GRU


import numpy as np
import pandas as pd
import math
from keras.models import Sequential
## Using TensorFlow backend.
from keras.layers import Dense
from keras.layers import LSTM, GRU, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


def create_window(dataset, look_back):
    """Build a supervised-learning windowed dataset.

    Each sample is the slice dataset[i:i+look_back] and its target is the
    value immediately following that window.

    Returns (X, Y) as numpy arrays with len(dataset) - look_back samples.
    """
    n_samples = len(dataset) - look_back
    windows = [dataset[start:start + look_back] for start in range(n_samples)]
    targets = [dataset[start + look_back] for start in range(n_samples)]
    return np.array(windows), np.array(targets)
    
def GRU_model(look_back):
  """Build and compile a two-layer stacked GRU regressor.

  Input shape is (1, look_back): a single timestep whose features are the
  look_back lagged observations; the output is one scalar prediction.
  """
  model = Sequential()
  model.add(GRU(64, dropout = 0.1, recurrent_dropout = 0.5, input_shape=(1, look_back), return_sequences=True))
  model.add(GRU(128, dropout = 0.1, recurrent_dropout = 0.5,activation="relu"))
  model.add(Dense(1))
  # FIX: summary() prints itself and returns None; the original
  # print(model.summary()) emitted a spurious "None" line.
  model.summary()
  model.compile(loss='mean_squared_error', optimizer='Adam')
  return model
 
# Series handed over from R on the Box-Cox scale (reticulate exposes it as
# r.y_box).
dataset = r.y_box
dataset = dataset.astype('float32')

# normalize the dataset to [0, 1] for the GRU
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = np.array(dataset)
dataset_scaled = scaler.fit_transform(dataset.reshape(-1,1))

# Last 365 points form the validation year; everything before is training.
train_size = len(dataset) - 365
val_size = len(dataset) - train_size
train_scaled, val_scaled = dataset_scaled[0:train_size,:], dataset_scaled[train_size:len(dataset_scaled),:]
    
batch_size = 1
look_back = 365
# Windowed samples: each input holds the previous 365 scaled values.
trainX_scaled, trainY_scaled = create_window(train_scaled, look_back)
# Reshape to (samples, timesteps=1, features=look_back) as the GRU expects.
trainX_scaled = np.reshape(trainX_scaled, (trainX_scaled.shape[0], 1, trainX_scaled.shape[1]))

model = GRU_model(look_back)
# Training is commented out: pre-trained weights are loaded below instead.
#model.fit(trainX_scaled, trainY_scaled, epochs=20, batch_size=1, verbose=1)
## Model: "sequential_1"
## _________________________________________________________________
## Layer (type)                 Output Shape              Param #   
## =================================================================
## gru_1 (GRU)                  (None, 1, 64)             82560     
## _________________________________________________________________
## gru_2 (GRU)                  (None, 128)               74112     
## _________________________________________________________________
## dense_1 (Dense)              (None, 1)                 129       
## =================================================================
## Total params: 156,801
## Trainable params: 156,801
## Non-trainable params: 0
## _________________________________________________________________
## None
model.load_weights("csv_weights/model_GRU_stacked.h5")

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent)."""
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100
    
# In-sample predictions, mapped back to the Box-Cox scale.
trainPredict_scaled = model.predict(trainX_scaled, batch_size=batch_size)
trainPredict = scaler.inverse_transform(trainPredict_scaled)

trainY = scaler.inverse_transform(trainY_scaled)

trainScore = mean_absolute_percentage_error(trainY, trainPredict)
print('Train Score: %.2f MAPE' % (trainScore))
## Train Score: 9.60 MAPE
# Recursive 365-step validation forecast: seed with the last training window
# plus the last observed (scaled) training value, then feed each prediction
# back as the newest input.
valX = trainX_scaled[len(trainX_scaled)-1]
valX = np.append(valX[0], trainY_scaled[len(trainY)-1][0])
val_pred = []
for i in range(365):
  # Take the most recent 365 values as the next input window.
  valX_i = valX[-365:].reshape(1,1,-1)
  prediction = model.predict(valX_i, batch_size=batch_size)
  valX = np.append(valX_i, prediction)
  val_pred.append(prediction)

valPredict = scaler.inverse_transform(np.array(val_pred).reshape(-1,1))
Yval = r.y_val
valScore = mean_absolute_percentage_error(Yval, valPredict)
print('Val Score: %.2f MAPE' % (valScore))
## Val Score: 10.87 MAPE
# plot validation: observed (black) vs GRU recursive forecast (blue),
# pulled from Python via py$ (this chunk is R).
plot(y_val_num, type = "l")
lines(as.numeric(py$valPredict), col = "blue")

# Training fit: the first look_back (365) observations have no prediction,
# so the observed series is offset accordingly.
plot(y_train_num[366:(length(y_train_num))], type = "l")
lines(as.numeric(py$trainPredict), col = "blue")

Applicazione dei tre modelli sui dati di test

Modello ARIMA

# Final ARIMA: refit on the full observed series (train + validation) and
# forecast the 334-day test horizon; orders fixed at the values selected
# earlier: (6,1,1)(1,1,1)[7] with K = 1 Fourier pair.
fs_train_final <- fourier(ts(y_box, frequency = 365), K = 1)
fs_test_final <- fourier(ts(y_box, frequency = 365), K = 1, h = 334)

dummy_holidays_train_final <- dm[holidays][1:length(y),]
dummy_holidays_test_final <- dm[holidays][(length(y)+1):(length(y)+334),]

mod5_reg_final <- Arima(y_box, c(6,1,1), list(order=c(1,1,1), period=7), 
                  xreg=as.matrix(cbind(fs_train_final,dummy_holidays_train_final)), include.constant = TRUE)
#mod5_reg
#checkresiduals(mod5_reg)
pred5_reg_final <- forecast(mod5_reg_final, h=334, xreg=as.matrix(cbind(fs_test_final,dummy_holidays_test_final)))
print(paste0("MAPE on training set ", mean(abs(mod5_reg_final$fitted - as.numeric(y_box))/as.numeric(y_box)) * 100))
## [1] "MAPE on training set 8.37480241207164"
plot(as.numeric(pred5_reg_final$mean), type = "l")

#autoplot(pred5_reg_final)

Modello UCM

# Full series padded with 334 NAs: KFS will predict the test horizon.
y_ucm_final <- as.numeric(y_box)
y_ucm_final[(length(y_ucm_final)+1):(length(y_ucm_final)+334)] <- NA 

# NOTE(review): identical redefinition of updt_for from the model-selection
# section — kept so this section runs standalone. Parameter mapping:
# pars[1] level, pars[2] weekly dummy, pars[3] slope (only m == 1),
# pars[3+m] trig seasonal, pars[4+m] observation noise (all log-variances).
updt_for <- function(pars, model, k, m) {
  model$Q[1, 1, 1] <- exp(pars[1])
  model$Q[2, 2, 1] <- exp(pars[2])
  if (m == 1)
    model$Q[3, 3, 1] <- exp(pars[3])
  
  diag(model$Q[(3+m):(2+(k*2)+m),(3+m):(2+(k*2)+m), 1]) <- exp(pars[3+m])
  model$H[1,1,1] <- exp(pars[4+m])
  
  model
}

# Final UCM: RW level + weekly dummy seasonal + 14 annual harmonics, refit
# on the full series with the test horizon left as missing values.
mod_ucm_final <- SSModel(y_ucm_final ~ SSMtrend(1, NA) + SSMseasonal(7, NA, "dummy") + SSMseasonal(365, NA, "trig", harmonics = 1:14), H = NA)
vary <- var(y_ucm_final, na.rm = TRUE)
init <- numeric(4)
init[1] <- log(vary) # log-var(level disturbance)
init[2] <- log(vary/10) # log-var(weekly dummy seasonal)
init[3] <- log(vary/10) # log-var(trig seasonal)
init[4] <- log(vary)  # log-var(observation noise)

mod_ucm_final$P1inf <- mod_ucm_final$P1inf * 0 
mod_ucm_final$a1[1] <- mean(y_ucm_final, na.rm = TRUE) 
diag(mod_ucm_final$P1) <- vary 
fit_final <- fitSSM(mod_ucm_final, init, updt_for, update_args = c(k = 14, m = 0))

smo_final <- KFS(fit_final$model, smoothing = c("state","mean"))
print(paste0("MAPE on training set ", mean(abs(smo_final$muhat[1:(length(y))] - as.numeric(y_box))/as.numeric(y_box)) * 100))
## [1] "MAPE on training set 6.83152740363415"
# Smoothed predictions over the 334-day test horizon only.
plot(smo_final$muhat[(length(y_box)+1):(length(y_ucm_final))], type = "l",col = "blue")

Modello ricorrente GRU

# Final GRU: the windows start 365 points before the validation stretch so
# every validation target has a full look-back history.
validation_scaled = dataset_scaled[(train_size-365):len(dataset),:]
validationX_scaled, validationY_scaled = create_window(validation_scaled, look_back)
validationX_scaled = np.reshape(validationX_scaled, (validationX_scaled.shape[0], 1, validationX_scaled.shape[1]))


model_final = GRU_model(look_back)
## Model: "sequential_2"
## _________________________________________________________________
## Layer (type)                 Output Shape              Param #   
## =================================================================
## gru_3 (GRU)                  (None, 1, 64)             82560     
## _________________________________________________________________
## gru_4 (GRU)                  (None, 128)               74112     
## _________________________________________________________________
## dense_2 (Dense)              (None, 1)                 129       
## =================================================================
## Total params: 156,801
## Trainable params: 156,801
## Non-trainable params: 0
## _________________________________________________________________
## None
# Pre-trained weights (fine-tuned on the validation year); the fit call that
# produced them is kept commented for reference.
model_final.load_weights("csv_weights/model_GRU_stacked_valid.h5")
#model_final.load_weights("model_GRU_stacked.h5")
#model_final.fit(validationX_scaled, validationY_scaled, epochs=4, batch_size=1, verbose=0)
# make predictions
# Predictions of the final model on the training window, mapped back to the
# Box-Cox scale.
trainPredict_final_scaled = model_final.predict(trainX_scaled, batch_size=batch_size)
trainPredict_final = scaler.inverse_transform(trainPredict_final_scaled)

# invert targets back to the Box-Cox scale
trainY = scaler.inverse_transform(trainY_scaled)

trainScore_final = mean_absolute_percentage_error(trainY, trainPredict_final)
print('Train Score: %.2f MAPE' % (trainScore_final))
## Train Score: 13.40 MAPE
# Predictions on the validation window (note: these data were used for the
# final training, so this is an in-sample score).
validPredict_scaled = model_final.predict(validationX_scaled, batch_size=batch_size)
validPredict_final = scaler.inverse_transform(validPredict_scaled)

validationY = scaler.inverse_transform(validationY_scaled)
validScore_final = mean_absolute_percentage_error(validationY, validPredict_final)
# FIX: this line printed the validation metric with the label "Train Score".
print('Val Score: %.2f MAPE' % (validScore_final))
## 7.27 MAPE (previously printed with the wrong "Train Score" label)

# Recursive 334-step forecast over the test horizon: seed with the last
# validation window plus the last observed (scaled) value, then feed each
# prediction back as the newest input.
testX = validationX_scaled[len(validationX_scaled)-1]
# BUG FIX: the original appended trainY_scaled[len(validationY)-1][0] — an
# early *training* value — instead of the last observed value of the series
# (the target of the final validation window), which is how the validation
# forecast above was seeded from trainY_scaled.
testX = np.append(testX[0], validationY_scaled[len(validationY_scaled)-1][0])
test_pred = []

for i in range(334):
  # Most recent 365 values form the next input window.
  test_i = testX[-365:].reshape(1,1,-1)
  prediction = model_final.predict(test_i, batch_size=batch_size)
  testX = np.append(test_i, prediction)
  test_pred.append(prediction)

testPredict = scaler.inverse_transform(np.array(test_pred).reshape(-1,1))
# Plot the recursive GRU test forecast (still on the Box-Cox scale).
plot(py$testPredict, type = "l", col="red")

pred_arima <- as.numeric(pred5_reg_final$mean)
pred_ucm <- smo_final$muhat[(length(y)+1):(length(y_ucm_final))]
pred_gru <- as.numeric(py$testPredict)

# Back-transform the three forecasts to the original scale.
# NOTE(review): biasadj with fvar = var(pred) uses the variance of the point
# forecasts, not the forecast-error variance — confirm this is intended.
pred_arima_inv <- InvBoxCox(pred_arima, lambda, biasadj = TRUE, fvar = var(pred_arima))
pred_ucm_inv <- InvBoxCox(pred_ucm, lambda, biasadj = TRUE, fvar = var(pred_ucm))
pred_gru_inv <- InvBoxCox(pred_gru, lambda, biasadj = TRUE, fvar = var(pred_gru))

# BUG FIX: the test dates must start right after the END of the observed
# series (length(y), i.e. 2019-01-01), not after the training set
# (length(y_train_num)), which pointed at the 2018 validation year.
test_data <- dm$Date[(length(y)+1):(length(y)+334)]
# FIX: use data.frame instead of cbind so the numeric prediction columns are
# not coerced to character by the date column.
pred_test <- data.frame(test_data, pred_arima_inv, pred_ucm_inv, pred_gru_inv)
colnames(pred_test) <- c("Data","ARIMA","UCM","GRU")
# FIX: removed the trailing comma that passed an empty argument to write.csv.
write.csv(pred_test, "csv_weights/SDMTSA_847160.csv")